Setup:
# knitr chunk defaults: echo code in the rendered document and cap
# printed output (max.print) at 100 values per chunk.
knitr::opts_chunk$set(echo = TRUE, max.print = 100)
Load libraries:
# Package loading. The '##' lines are console output captured when the
# document was rendered (attach messages and masking notes).
library(readxl)     # read_xlsx(): Phonotacticon/PhonoBib spreadsheets
library(tidyverse)  # dplyr/tidyr/stringr/readr/ggplot2 used throughout
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(stringi)    # stri_extract_all_regex(): segment splitting
library(Cairo)      # cairo_pdf(): PDF output with full Unicode/IPA support
library(ape)        # NOTE(review): not obviously used below — confirm needed
library(geosphere)  # distHaversine(): great-circle distances
library(Matrix)     # NOTE(review): not obviously used below — confirm needed
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(ggrepel)    # geom_label_repel(): non-overlapping plot labels
library(ggpubr)     # NOTE(review): not obviously used below — confirm needed
##
## Attaching package: 'ggpubr'
##
## The following object is masked from 'package:ape':
##
## rotate
library(factoextra) # fviz_nbclust(): silhouette-based cluster count
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
Load data:
# Load the three data tables.
# guess_max = 1661 makes readxl inspect that many rows when guessing column
# types (presumably the full sheet length — verify against the file).
Phonotacticon <- read_xlsx("Phonotacticon.xlsx", guess_max = 1661)
PhonoBib <- read_xlsx("PhonoBib.xlsx")
# PanPhon feature table: one 'ipa' string column plus 22 numeric feature
# columns. Duplicate ipa rows are dropped, keeping the first occurrence.
PanPhon <- read_csv("PanPhonPhonotacticon1_0.csv") %>%
  filter(!duplicated(ipa))
## Rows: 7398 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ipa
## dbl (22): syl, son, cons, cont, delrel, lat, nas, strid, voi, sg, cg, ant, c...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the three loaded tables.
Phonotacticon
PhonoBib
PanPhon
Create PanPhon regular expression:
# Build a single alternation regex over the whole PanPhon inventory.
# Symbols are ordered longest-first so that multi-character segments are
# matched in preference to their single-character prefixes.
PanPhonOrder <- PanPhon$ipa[order(-nchar(PanPhon$ipa), PanPhon$ipa)]
PanPhonRegex <- str_c("(?:", str_flatten(PanPhonOrder, "|"), ")")
# Show only the first 100 characters of the (very long) pattern.
str_trunc(PanPhonRegex, 100)
## [1] "(?:h͡d̪͡ɮ̪ʲʷ|h͡d̪͡ɮ̪ʷː|h͡d̪͡ɮ̪ʷˠ|h͡d̪͡ɮ̪ʷˤ|h͡d̪͡z̪ʲʷ|h͡d̪͡z̪ʷː|h͡d̪͡z̪ʷˠ|h͡d̪͡z̪ʷˤ|h͡t̪͡ɬ̪ʲʷ|h͡t̪..."
Subset complete Eurasian lects out of Phonotacticon:
# Keep only complete Eurasian lects with usable onset (O), nucleus (N),
# and coda (C) inventories: phoneme inventory P present, no '?'
# placeholders, and no capital letters or '[' (markup) in O/N/C.
Eurasia <- Phonotacticon %>%
  filter(Macroarea == "Eurasia",
         Complete,
         !is.na(P)) %>%
  filter(if_all(c(O, N, C),
                ~ .x != "?" & !grepl("[A-Z]|\\[", .x)))
Eurasia
Extract onset, nucleus, and coda sequences:
# Reshape each lect's onset (O), nucleus (N), and coda (C) inventories into
# one row per (Lect, Category, Sequence). Inventories are space-separated
# strings of sequences.
Sequences <- Eurasia %>%
  select(Lect, O, N, C) %>%
  # One row per lect x category, whole inventory in 'Sequence'.
  pivot_longer(-Lect, names_to = 'Category', values_to = 'Sequence') %>%
  # Split the space-separated inventory into columns "1".."500".
  # NOTE(review): 500 is a hard cap — an inventory with more than 500
  # sequences would be silently truncated (separate() warns). Confirm the
  # data stays below this bound.
  separate(col = Sequence,
           sep = ' ',
           into = as.character(1:500),
           fill = 'right') %>%
  # Back to long form: one row per individual sequence.
  pivot_longer(-c(Lect, Category), names_to = 'Number', values_to = 'Sequence') %>%
  select(-Number) %>%
  filter(!is.na(Sequence)) %>%
  distinct()
Sequences
Split the sequences into segments:
# Split every sequence into its component PanPhon segments.
# stri_extract_all_regex() returns a character matrix (one row per
# sequence, padded on the right), which is reshaped to one row per
# (Lect, Category, Sequence, Order, ipa).
Segments <- stri_extract_all_regex(Sequences$Sequence,
                                   pattern = PanPhonRegex,
                                   simplify = TRUE) %>%
  as_tibble(.name_repair = 'unique') %>%
  mutate(Lect = Sequences$Lect,
         Category = Sequences$Category,
         Sequence = Sequences$Sequence) %>%
  pivot_longer(cols = -c(Lect, Category, Sequence),
               names_to = 'Order',
               values_to = 'ipa') %>%
  # Repaired names are '...1', '...2', ...; parse_number() recovers the
  # column index directly. (The previous as.factor() %>% as.integer()
  # trick sorted the names alphabetically, which silently misorders
  # segments once there are 10 or more columns: '...10' < '...2'.)
  mutate(Order = parse_number(Order)) %>%
  # Drop the padding cells produced by simplify = TRUE.
  filter(ipa != "")
## New names:
## • `` -> `...1`
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
Segments
Measure the length of each sequence:
# Length (in segments) of every sequence.
Sequences_length <- Segments %>%
  group_by(Lect, Category, Sequence) %>%
  summarise(Length = n(), .groups = 'drop')
Sequences_length
Join the length of each sequence to segments:
# Attach each sequence's length to every segment row. The join keys are
# now spelled out explicitly instead of being inferred (which printed a
# "Joining, by = ..." message and is fragile to column changes).
Segments <- left_join(Segments, Sequences_length,
                      by = c('Lect', 'Category', 'Sequence'))
Segments
Find the maximal sequence length (in segments):
# Longest sequence (in segments) found anywhere in the data.
MaxLength <- Sequences_length %>%
  pull(Length) %>%
  max()
MaxLength
## [1] 5
Count the number of split segments:
# Total number of segment rows after splitting.
Segments_number <- Segments %>% nrow()
Segments_number
## [1] 7443
Assign different positions to each sequence:
# Replicate the segment table once per possible starting position so each
# sequence can be compared at every offset 0..MaxLength-1. The number of
# copies is derived from MaxLength instead of the previous hard-coded 5,
# which would silently break if the maximal sequence length changed.
Sequences_rep <- bind_rows(rep(list(Segments), MaxLength)) %>%
  mutate(Position = rep(0:(MaxLength - 1),
                        each = Segments_number)) %>%
  mutate(Order = Order + Position) %>%
  # Discard offsets that would push a sequence past the maximal length.
  filter(Length + Position <= MaxLength) %>%
  select(-Length)
Sequences_rep
Join segments with their phonological features:
# Attach PanPhon feature values to every segment and spread them into one
# column per (feature, order) pair — e.g. 'syl1', 'son2' — ending with one
# wide feature vector per (Lect, Category, Sequence, Position).
Sequences_features <- Sequences_rep %>%
  left_join(PanPhon, by = 'ipa') %>%
  # Long form: one row per segment x feature.
  pivot_longer(cols = -c(Lect,
                         Category,
                         Sequence,
                         Order,
                         ipa,
                         Position),
               names_to = 'Feature',
               values_to = 'Value') %>%
  # Tag each feature name with the segment's position in the sequence.
  mutate(Feature = paste0(Feature, Order)) %>%
  pivot_wider(names_from = 'Feature',
              values_from = 'Value') %>%
  select(-Order, -ipa) %>%
  # Collapse the per-segment rows of one sequence into a single row.
  pivot_longer(cols = -c(Lect, Category, Sequence, Position),
               names_to = 'Feature',
               values_to = 'Value') %>%
  # NOTE(review): the widen above appears to create list cells whose empty
  # entries stringify as 'NULL'; this filter drops them. Confirm no real
  # feature value can ever equal the string 'NULL'.
  filter(Value != 'NULL') %>%
  pivot_wider(names_from = 'Feature',
              values_from = 'Value') %>%
  # Feature slots beyond a sequence's length are absent -> fill with 0.
  replace(is.na(.), 0)
Sequences_features
Paste sequence and position together:
# Key each feature vector by sequence + starting position, dropping the
# lect/category bookkeeping and deduplicating identical vectors.
Sequences_SeqPos <- Sequences_features %>%
  mutate(SequencePosition = str_c(Sequence, Position)) %>%
  relocate(SequencePosition) %>%
  select(-Lect, -Category, -Position, -Sequence) %>%
  distinct()
Sequences_SeqPos
Calculate the distance between segments:
# Pairwise Euclidean distances between all feature vectors, labelled on
# both axes by SequencePosition.
Sequences_distance <- Sequences_SeqPos %>%
  select(-SequencePosition) %>%
  dist(method = 'euclidean') %>%
  as.matrix() %>%
  as_tibble(.name_repair = 'unique') %>%
  bind_cols(SequencePosition = Sequences_SeqPos$SequencePosition, .)
# Name the distance columns after the sequences they correspond to.
colnames(Sequences_distance) <- c('SequencePosition',
                                  Sequences_SeqPos$SequencePosition)
Sequences_distance
Find the minimal distance among the different positions:
# For every pair of sequences, take the minimum distance over all the
# position shifts at which they were compared.
Sequences_MinDistance <- Sequences_distance %>%
  pivot_longer(-SequencePosition,
               names_to = 'Sequence.y',
               values_to = 'Distance') %>%
  # Strip the appended position to recover the bare sequence. The pattern
  # is anchored ('[0-9]+$'): the previous unanchored '[0-9]' removed only
  # the first digit found anywhere in the string, which breaks for
  # positions >= 10 or for sequences that themselves contain a digit.
  mutate(Sequence.x = str_remove(SequencePosition, '[0-9]+$')) %>%
  select(-SequencePosition) %>%
  mutate(Sequence.y = str_remove(Sequence.y, '[0-9]+$')) %>%
  group_by(Sequence.x, Sequence.y) %>%
  # .groups = 'drop' returns an ungrouped result and silences the message.
  summarise(Distance = min(Distance), .groups = 'drop')
Sequences_MinDistance
Subset the distance of /pl/ and other segments:
# Sequences closest to /pl/, nearest first (the empty marker excluded).
pl <- Sequences_MinDistance %>%
  filter(Sequence.x == 'pl') %>%
  filter(Sequence.y != '∅') %>%
  arrange(Distance)
pl
Subset the distance of /ia/ and other segments:
# Sequences closest to /ia/, nearest first (the empty marker excluded).
ia <- Sequences_MinDistance %>%
  filter(Sequence.x == 'ia') %>%
  filter(Sequence.y != '∅') %>%
  arrange(Distance)
ia
Compare the difference between two lects within the same category:
# Lect-to-lect distance within each category (O/N/C): for each sequence of
# lect X, find its nearest sequence in lect Y, average those minima to get
# a directed distance X -> Y, then symmetrize by taking the max over the
# two directions.
ONC_distance <- Sequences %>%
  # All cross-lect pairs of sequences within the same category.
  left_join(Sequences, by = c('Category')) %>%
  left_join(Sequences_MinDistance,
            by = c('Sequence.x', 'Sequence.y')) %>%
  # Nearest counterpart in lect Y of each sequence of lect X.
  group_by(Lect.x, Category, Sequence.x, Lect.y) %>%
  summarize(Distance = min(Distance)) %>%
  # Average over lect X's sequences: directed distance X -> Y.
  group_by(Lect.x, Category, Lect.y) %>%
  summarize(Distance = mean(Distance)) %>%
  # Canonical unordered pair label "A vs. B" (alphabetical order).
  mutate(Lect_vs_Lect = str_c(pmin(Lect.x, Lect.y),
                              "vs.",
                              pmax(Lect.x, Lect.y),
                              sep = " ")) %>%
  # Symmetrize: keep the larger of the two directed distances.
  group_by(Lect_vs_Lect, Category) %>%
  mutate(Distance = max(Distance)) %>%
  ungroup() %>%
  select(Lect_vs_Lect, Category, Distance) %>%
  distinct()
## `summarise()` has grouped output by 'Lect.x', 'Category', 'Sequence.x'. You can
## override using the `.groups` argument.
## `summarise()` has grouped output by 'Lect.x', 'Category'. You can override
## using the `.groups` argument.
ONC_distance
Count the number of tones in each lect:
# Number of tones per lect. Tone inventories are space-separated strings
# in column T; entries containing "-" are treated as toneless (count 0).
Tones <- Eurasia %>%
  select(Lect, `T`) %>%
  mutate(`T` = gsub("\\-", NA, `T`),
         `T` = str_count(`T`, " ") + 1,
         `T` = replace_na(`T`, 0))
Tones
Calculate the Canberra distance between the numbers of tones of each pair of lects:
# Pairwise Canberra distance between tone counts.
# NOTE(review): dist() is applied to the whole Tones tibble, so the
# character Lect column is coerced to numeric (hence the warning below)
# and contributes only NAs. dist() rescales for the missing column, which
# appears to multiply every distance by a constant — immaterial here only
# because the distances are z-scaled later. dist(Tones$`T`, ...) would be
# cleaner; left unchanged to preserve the published numbers exactly.
Tones_distance <- dist(Tones, method = 'canberra')
## Warning in dist(Tones, method = "canberra"): NAs introduced by coercion
# Presumably pairs where both counts are 0 (0/0 Canberra terms) come out
# as NA; they are defined to be distance 0 here.
Tones_distance[is.na(Tones_distance)] <- 0
Tones_distance <- Tones_distance %>%
  as.matrix() %>%
  as_tibble(.name_repair = 'unique')
# Label rows and columns with the lect names.
Tones_distance <- bind_cols(Tones$Lect, Tones_distance)
## New names:
## • `` -> `...1`
colnames(Tones_distance) <- c('Lect', Tones$Lect)
# Long form with the same canonical "A vs. B" pair labels as ONC_distance.
Tones_distance <- Tones_distance %>%
  pivot_longer(-Lect, names_to = 'Lect2', values_to = 'Distance') %>%
  mutate(Lect_vs_Lect = str_c(pmin(Lect, Lect2),
                              "vs.",
                              pmax(Lect, Lect2),
                              sep = " "),
         Category = 'T') %>%
  select(Lect_vs_Lect, Category, Distance) %>%
  distinct()
Tones_distance
Bind segmental and tonal distances:
# Stack the segmental (O/N/C) and tonal (T) distances, then spread the
# categories into columns: one row per lect pair.
ONCT_distance <- bind_rows(ONC_distance, Tones_distance) %>%
  pivot_wider(names_from = Category,
              values_from = Distance)
ONCT_distance
Normalize the distances:
# Standardize each category's distances (z-scores) so the four categories
# contribute on a comparable scale.
ONCT_distance <- ONCT_distance %>%
  mutate(across(c(O, N, C, `T`), ~ as.numeric(scale(.x))))
ONCT_distance
Calculate the mean of the distances and obtain the phonological distances:
# Overall phonological distance: mean of the four z-scaled category
# distances, shifted so that the minimum is 0.
PhonoDist <- ONCT_distance %>%
  mutate(Distance = rowMeans(across(c(O, N, C, `T`)))) %>%
  mutate(Distance = Distance - min(Distance)) %>%
  select(Lect_vs_Lect, Distance)
PhonoDist
Split the Lect_vs_Lect column of PhonoDist into Lect.x and Lect.y:
# Split the "A vs. B" labels back into the two lect names.
Lect_vs_Lect <- PhonoDist$Lect_vs_Lect %>%
  str_split_fixed(' vs. ', n = 2) %>%
  as_tibble(.name_repair = ~ c('Lect.x', 'Lect.y'))
Lect_vs_Lect
Visualize the phonological distances via multidimensional scaling:
# Classical multidimensional scaling (2D) of the phonological distances.
PhonoScale <- PhonoDist %>%
  bind_cols(Lect_vs_Lect) %>%
  select(-Lect_vs_Lect) %>%
  # Widen into a lect x lect table. Each unordered pair occurs exactly
  # once, so the table is triangular with NAs on the unused side.
  pivot_wider(names_from = Lect.y,
              values_from = Distance) %>%
  select(-Lect.x) %>%
  t() %>%
  # NOTE(review): as.dist() reads one triangle of the transposed matrix —
  # verify the row/column alignment picks up the intended pair values.
  as.dist() %>%
  # cmdscale() defaults to 2 dimensions -> columns V1, V2.
  cmdscale() %>%
  as.data.frame() %>%
  rownames_to_column('Lect') %>%
  as_tibble()
PhonoScale
Use the average silhouette method:
# Average-silhouette curve over candidate cluster counts for k-means on
# the 2-D MDS coordinates.
Silhouette <- fviz_nbclust(select(PhonoScale, -Lect), kmeans,
                           method = 'silhouette')
Silhouette
Find the optimal number of clusters:
# Pick the cluster count with the highest average silhouette from the
# plot's underlying data (columns: cluster count and silhouette 'y').
ClusterNumber <- Silhouette$data %>%
  as_tibble %>%
  filter(y == max(y)) %>%
  # Dropping y leaves only the cluster-count column; as.numeric() on the
  # remaining one-cell tibble extracts it.
  # NOTE(review): if that column is a factor this yields the level's
  # position, which coincides with the count only while levels run
  # 1, 2, 3, ... in order — confirm against factoextra's output.
  select(-y) %>%
  as.numeric()
ClusterNumber
## [1] 2
Cluster PhonoScale into groups:
# k-means clustering of the 2-D MDS coordinates into the chosen number of
# groups; the cluster assignment is stored as a factor column 'group'.
K <- PhonoScale %>%
  select(-Lect) %>%
  kmeans(centers = ClusterNumber)
PhonoScale <- PhonoScale %>%
  mutate(group = as.factor(K$cluster))
PhonoScale
Visualize the multidimensional scaling:
# Scatter plot of the MDS solution: points and repelled lect labels,
# shaped/coloured/filled by cluster.
PhonoScalePlot <- ggplot(PhonoScale, aes(x = V1, y = V2)) +
  geom_point(aes(shape = group, color = group),
             show.legend = FALSE) +
  geom_label_repel(aes(label = Lect, fill = group),
                   max.overlaps = 500,
                   min.segment.length = 0,
                   size = 2,
                   show.legend = FALSE)
# Save to 'PhonoScalePlot.pdf' via Cairo, then display inline as well.
cairo_pdf(file = 'PhonoScalePlot.pdf',
          width = 7,
          height = 7,
          family = 'Times New Roman')
PhonoScalePlot
dev.off()
## quartz_off_screen
## 2
PhonoScalePlot
Calculate the geographical distances between each pair of lects:
# Great-circle distance in kilometres between every pair of lects.
Lect_LonLat <- PhonoBib %>%
  filter(Lect %in% Eurasia$Lect) %>%
  select(Lect, lon, lat)
# Constant helper column used as the join key below, producing the full
# Cartesian product of lects (a cross join).
Lect_Kilometers <- Lect_LonLat %>%
  mutate(Kilometers = 'Kilometers')
Coordinates <- Lect_Kilometers %>%
  full_join(Lect_Kilometers, by = 'Kilometers')
# Canonical unordered pair label, matching the other distance tables.
Coordinates <- Coordinates %>%
  mutate(Lect_vs_Lect = str_c(pmin(Lect.x, Lect.y),
                              "vs.",
                              pmax(Lect.x, Lect.y),
                              sep = " "))
# (lon, lat) pairs for each side of every lect pair.
Coordinates.x <- select(Coordinates, lon.x, lat.x)
Coordinates.y <- select(Coordinates, lon.y, lat.y)
# distHaversine() returns metres; /1000 converts to kilometres. The
# placeholder 'Kilometers' column is overwritten with the real distance.
GeoDist <- Coordinates %>%
  mutate(Kilometers =
           distHaversine(Coordinates.x, Coordinates.y) / 1000) %>%
  select(Lect_vs_Lect, Kilometers) %>%
  distinct()
GeoDist
Join the phonological distances and geographical distances:
# Combine phonological and geographical distances by lect pair.
PhonoGeoDist <- PhonoDist %>%
  full_join(GeoDist, by = 'Lect_vs_Lect')
PhonoGeoDist
Filter out the distance of a lect from itself:
# Drop self-comparisons ("X vs. X"). The pattern is now anchored: the
# previous unanchored '(.*) vs\\. \\1' also matched distinct pairs where
# one lect name is a prefix of the other (e.g. "Ava vs. Avar"), silently
# discarding genuine pairs.
PhonoGeoDist <- PhonoGeoDist %>%
  filter(!grepl('^(.*) vs\\. \\1$', Lect_vs_Lect))
Linear regression of phonological distance on geographical distance:
# OLS regression of phonological distance on geographical distance.
# Per the output below: positive slope (9.211e-05 z per km, p < 2e-16),
# R-squared ~= 0.18 — geography explains a modest share of the variance.
PhonoGeoDist %>%
  lm(formula = Distance ~ Kilometers) %>%
  summary()
##
## Call:
## lm(formula = Distance ~ Kilometers, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.72676 -0.34641 0.03341 0.36328 1.48456
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.754e+00 2.030e-02 86.43 <2e-16 ***
## Kilometers 9.211e-05 4.720e-06 19.52 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5054 on 1706 degrees of freedom
## Multiple R-squared: 0.1825, Adjusted R-squared: 0.182
## F-statistic: 380.9 on 1 and 1706 DF, p-value: < 2.2e-16
Visualize the linear regression:
# Scatter plot with fitted regression line: phonological vs. geographical
# distance.
PhonoLM <- PhonoGeoDist %>%
  ggplot(aes(x = Kilometers, y = Distance)) +
  geom_point(alpha = 0.1) +
  geom_smooth(formula = y ~ x, method = 'lm', color = 'red') +
  theme_classic() +
  labs(x = 'Geographical distance (km)',
       y = 'Phonological distance (z)')
# Save to 'PhonoLM.pdf' via Cairo, then display inline as well.
cairo_pdf(file = 'PhonoLM.pdf',
          width = 4,
          height = 3,
          family = 'Times New Roman')
PhonoLM
dev.off()
## quartz_off_screen
## 2
PhonoLM